In [5]:
import pandas as pd
data = pd.read_csv('Accidental_Drug_Related_Deaths_2012-2023.csv')
data.head()
Out[5]:
Date Date Type Age Sex Race Ethnicity Residence City Residence County Residence State Injury City ... Xylazine Gabapentin Opiate NOS Heroin/Morph/Codeine Other Opioid Any Opioid Other ResidenceCityGeo InjuryCityGeo DeathCityGeo
0 05/29/2012 Date of death 37.0 Male Black NaN STAMFORD FAIRFIELD NaN STAMFORD ... NaN NaN NaN NaN NaN NaN NaN STAMFORD, CT\n(41.051924, -73.539475) STAMFORD, CT\n(41.051924, -73.539475) CT\n(41.575155, -72.738288)
1 06/27/2012 Date of death 37.0 Male White NaN NORWICH NEW LONDON NaN NORWICH ... NaN NaN NaN NaN NaN NaN NaN NORWICH, CT\n(41.524304, -72.075821) NORWICH, CT\n(41.524304, -72.075821) Norwich, CT\n(41.524304, -72.075821)
2 03/24/2014 Date of death 28.0 Male White NaN HEBRON NaN NaN HEBRON ... NaN NaN NaN NaN NaN NaN NaN HEBRON, CT\n(41.658069, -72.366324) HEBRON, CT\n(41.658069, -72.366324) Marlborough, CT\n(41.632043, -72.461309)
3 12/31/2014 Date of death 26.0 Female White NaN BALTIC NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN BALTIC, CT\n(41.617221, -72.085031) CT\n(41.575155, -72.738288) Baltic, CT\n(41.617221, -72.085031)
4 01/16/2016 Date of death 41.0 Male White NaN SHELTON FAIRFIELD CT SHELTON ... NaN NaN NaN NaN NaN Y NaN SHELTON, CT\n(41.316843, -73.092968) SHELTON, CT\n(41.316843, -73.092968) Bridgeport, CT\n(41.179195, -73.189476)

5 rows × 48 columns

In [6]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Load the dataset
data = pd.read_csv('Accidental_Drug_Related_Deaths_2012-2023.csv')

# Convert 'Sex' column: Male -> 0, Female -> 1
data['Sex'] = data['Sex'].map({'Male': 0, 'Female': 1})

# Convert 'Any Opioid' column: Replace NaN/Null with 'No', and 'Yes' stays as is
data['Any Opioid'] = data['Any Opioid'].fillna('No')  # Replace NaN with 'No'

# Map 'Yes' to 1 and 'No' to 0 for 'Any Opioid'
data['Any Opioid'] = data['Any Opioid'].map({'Y': 1, 'No': 0})

# Show the unique values of 'Any Opioid'
print(data['Any Opioid'].value_counts())

# Prepare the features and target
X = data[['Age', 'Sex']]  # Features (Age and Sex)
y = data['Any Opioid']    # Target (Any Opioid)

# Before proceeding, ensure 'Any Opioid' doesn't have NaN values
print("\nBefore handling NaN in 'Any Opioid':")
print(y.isna().sum())  # Display the number of NaN values in the target column

# Handle NaN values in the 'Any Opioid' column (Replace NaN with 0 or 'No')
y = y.fillna(0)  # Fill NaN with 0 to indicate 'No'

# Verify that NaN values were handled correctly
print("\nAfter handling NaN in 'Any Opioid':")
print(y.isna().sum())  # Ensure there are no NaN values left in the target column

# Split the dataset into training and testing sets (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Record the start time for measuring execution time
start_time = time.time()

# Initialize the Random Forest Classifier with 50 estimators (trees)
clf = RandomForestClassifier(n_estimators=50)

# Train the model using the training data
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Record the end time to calculate the execution time
end_time = time.time()

# Calculate the execution time
exe_time = end_time - start_time
print(f"\nExecution time without parallelization: {exe_time} seconds")

# Evaluate the model performance
CM = confusion_matrix(y_pred, y_test)  # Confusion Matrix
print("\nConfusion Matrix:")
print(CM)

AS = accuracy_score(y_pred, y_test)  # Accuracy Score
print(f"\nAccuracy Score: {AS}")

CR = classification_report(y_pred, y_test)  # Classification Report (precision, recall, f1-score)
print("\nClassification Report:")
print(CR)
Any Opioid
1.0    8828
0.0    3034
Name: count, dtype: int64

Before handling NaN in 'Any Opioid':
119

After handling NaN in 'Any Opioid':
0

Execution time without parallelization: 0.28926849365234375 seconds

Confusion Matrix:
[[   4    2]
 [ 978 2611]]

Accuracy Score: 0.7273991655076495

Classification Report:
              precision    recall  f1-score   support

         0.0       0.00      0.67      0.01         6
         1.0       1.00      0.73      0.84      3589

    accuracy                           0.73      3595
   macro avg       0.50      0.70      0.43      3595
weighted avg       1.00      0.73      0.84      3595

In [7]:
pip install graphviz
Requirement already satisfied: graphviz in c:\users\sanja\downloads\anaconda\lib\site-packages (0.20.3)
Note: you may need to restart the kernel to use updated packages.
In [8]:
from sklearn.tree import export_graphviz
import graphviz
In [9]:
rf=RandomForestClassifier(n_estimators=1)
rf.fit(X_train,y_train)
Out[9]:
RandomForestClassifier(n_estimators=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(n_estimators=1)
In [10]:
len(rf.estimators_)
Out[10]:
1
In [12]:
from sklearn import tree
import matplotlib.pyplot as plt
X = data[['Age', 'Sex']]  # Features (Age and Sex)
y = data['Any Opioid'] 
plt.figure(figsize=(45,35))
_=tree.plot_tree(rf.estimators_[0], filled=True)
No description has been provided for this image
In [13]:
%matplotlib inline
plt.figure(figsize=(25,15))  # Increase figure size
tree.plot_tree(rf.estimators_[0], filled=True, feature_names=X.columns, rounded=True, fontsize=12)
plt.show()
No description has been provided for this image
In [18]:
import pandas as pd
import time
from dask_ml.model_selection import train_test_split
from dask.distributed import Client
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Initialize Dask client for parallelization
client = Client(n_workers=6)

# Load the dataset
data = pd.read_csv('Accidental_Drug_Related_Deaths_2012-2023.csv')

# Convert 'Sex' column: Male -> 0, Female -> 1
data['Sex'] = data['Sex'].map({'Male': 0, 'Female': 1})

# Convert 'Any Opioid' column: Replace NaN/Null with 'No', and 'Yes' stays as is
data['Any Opioid'] = data['Any Opioid'].fillna('No')  # Replace NaN with 'No'

# Map 'Yes' to 1 and 'No' to 0 for 'Any Opioid'
data['Any Opioid'] = data['Any Opioid'].map({'Y': 1, 'No': 0})

# Show the unique values of 'Any Opioid'
print(data['Any Opioid'].value_counts())

# Prepare the features and target
X = data[['Age', 'Sex']]  # Features (Age and Sex)
y = data['Any Opioid']    # Target (Any Opioid)

# Before proceeding, ensure 'Any Opioid' doesn't have NaN values
print("\nBefore handling NaN in 'Any Opioid':")
print(y.isna().sum())  # Display the number of NaN values in the target column

# Handle NaN values in the 'Any Opioid' column (Replace NaN with 0 or 'No')
y = y.fillna(0)  # Fill NaN with 0 to indicate 'No'

# Verify that NaN values were handled correctly
print("\nAfter handling NaN in 'Any Opioid':")
print(y.isna().sum())  # Ensure there are no NaN values left in the target column

# Split the dataset into training and testing sets (70% training, 30% testing) using Dask
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

# Record the start time for measuring execution time
start_time = time.time()

# Initialize the Random Forest Classifier with 50 estimators (trees)
clf = RandomForestClassifier(n_estimators=50)

# Train the model using the training data
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Record the end time to calculate the execution time
end_time = time.time()

# Calculate the execution time
exe_time = end_time - start_time
print(f"\nExecution time with Dask parallelization: {exe_time} seconds")

# Evaluate the model performance
CM = confusion_matrix(y_pred, y_test)  # Confusion Matrix
print("\nConfusion Matrix:")
print(CM)

AS = accuracy_score(y_pred, y_test)  # Accuracy Score
print(f"\nAccuracy Score: {AS}")

CR = classification_report(y_pred, y_test)  # Classification Report (precision, recall, f1-score)
print("\nClassification Report:")
print(CR)

# Close the Dask client
client.close()
C:\Users\sanja\Downloads\anaconda\Lib\site-packages\distributed\node.py:187: UserWarning: Port 8787 is already in use.
Perhaps you already have a cluster running?
Hosting the HTTP server on port 59682 instead
  warnings.warn(
C:\Users\sanja\Downloads\anaconda\Lib\contextlib.py:144: UserWarning: Creating scratch directories is taking a surprisingly long time. (1.46s) This is often due to running workers on a network file system. Consider specifying a local-directory to point workers to write scratch data to a local disk.
  next(self.gen)
Any Opioid
1.0    8828
0.0    3034
Name: count, dtype: int64

Before handling NaN in 'Any Opioid':
119

After handling NaN in 'Any Opioid':
0

Execution time with Dask parallelization: 0.8758988380432129 seconds

Confusion Matrix:
[[   7    7]
 [ 975 2606]]

Accuracy Score: 0.7268428372739917

Classification Report:
              precision    recall  f1-score   support

         0.0       0.01      0.50      0.01        14
         1.0       1.00      0.73      0.84      3581

    accuracy                           0.73      3595
   macro avg       0.50      0.61      0.43      3595
weighted avg       0.99      0.73      0.84      3595

In [19]:
from sklearn.tree import export_graphviz
import graphviz
In [20]:
rf=RandomForestClassifier(n_estimators=1)
rf.fit(X_train,y_train)
Out[20]:
RandomForestClassifier(n_estimators=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(n_estimators=1)
In [21]:
len(rf.estimators_)
Out[21]:
1
In [22]:
from sklearn import tree
X = data[['Age', 'Sex']]  # Features (Age and Sex)
y = data['Any Opioid'] 
plt.figure(figsize=(45,35))
_=tree.plot_tree(rf.estimators_[0], filled=True)
No description has been provided for this image
In [23]:
%matplotlib inline
plt.figure(figsize=(25,15))  # Increase figure size
tree.plot_tree(rf.estimators_[0], filled=True, feature_names=X.columns, rounded=True, fontsize=12)
plt.show()
No description has been provided for this image
In [ ]: